This section and the next are relevant for reproducibility purposes. For results, please skip to section 3 (Quality Control)
suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(emoji)
library(gplots)
library(gtools)
library(here)
library(hyperSpec)
library(limma)
library(magrittr)
library(parallel)
library(patchwork)
library(PCAtools)
library(pheatmap)
library(plotly)
library(pvclust)
library(RColorBrewer)
library(tidyverse)
library(tximport)
library(vsn)
})samples <- read_csv(here("doc/pine.csv"),
col_types=cols(.default=col_factor())) %>%
separate_wider_delim(SampleDescription,delim = " - ",names = c("EcotypeLatitude","LightCondition","Replicate")) %>%
mutate(EcotypeLatitude = gsub(" (latitude)","",gsub("Pine ecotype ","P",EcotypeLatitude),fixed = T),
LightCondition = gsub("light condition ","",LightCondition),
Replicate = gsub("replicate ","",Replicate))tx2gene <- suppressMessages(read_delim(here("analysis/pine/nfcoreResult/salmon/tx2gene.tsv"),delim="\t",
col_names=c("TXID","GENE"))) %>%
select(-X3)filelist <- data.frame(Filenames = list.files(here("analysis/pine/nfcoreResult/salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)) %>%
mutate(SampleID = basename(dirname(Filenames)))# stopifnot(all(match(sub("_CHANGE-ME.*","",basename(dirname(filelist))),
# samples$SampleID) == 1:nrow(samples)))Read the expression at the gene level
txi <- suppressMessages(tximport(files = samples$Filenames,
type = "salmon",
tx2gene=tx2gene))
counts <- txi$counts
colnames(counts) <- samples$SampleID
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))## [1] "11.1% percent (6557) of 58969 genes are not expressed"
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples)
ggplot(dat,aes(x,y,fill=EcotypeLatitude)) +
geom_col() +
scale_y_continuous(name="reads") +
facet_grid(~ factor(LightCondition), scales = "free") +
theme_bw() +
theme(axis.text.x=element_text(angle=90,size=8),
axis.title.x=element_blank())👉 The raw sequencing depth of replicate 4 (B-P67-4 and D-P56-4) are significantly lower than other replicates.
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) +
geom_density(na.rm = TRUE) +
ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)") +
theme_bw()👉 The cumulative gene coverage is as expected
dat <- as.data.frame(log10(counts)) %>%
utils::stack() %>%
mutate(LightCondition=samples$LightCondition[match(ind,samples$SampleID)],
Ecotype=samples$EcotypeLatitude[match(ind,samples$SampleID)],
Replicate=samples$Replicate[match(ind,samples$SampleID)])
ggplot(dat,aes(x=values,group=ind,col=LightCondition)) +
geom_density(na.rm = TRUE) +
ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)") +
theme_bw()ggplot(dat,aes(x=values,group=ind,col=Ecotype)) +
geom_density(na.rm = TRUE) +
ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)") +
theme_bw()ggplot(dat,aes(x=values,group=ind,col=Replicate)) +
geom_density(na.rm = TRUE) +
ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)") +
theme_bw()👉 Replicate 4 has different sequencing depth characteristics from other replicates.
# dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("analysis/pine/raw-unormalised-gene-expression_data.csv"))
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula are characters, converting to factors
## using counts and average transcript lengths from tximport
(i.e. the sequencing library size effect)
dds <- estimateSizeFactors(dds) %>%
suppressMessages()
boxplot(normalizationFactors(dds),
main="Sequencing libraries size factor",
las=2,log="y")
abline(h=1, col = "Red", lty = 3)and without outliers:
boxplot(normalizationFactors(dds),
main="Sequencing libraries size factor",
las=2,log="y", outline=FALSE)
abline(h=1, col = "Red", lty = 3)👉 The libraries’ size factors of replicate 4 are significantly lower than other replicates.
Assess whether there might be a difference in library size linked to a given metadata
boxplot(split(t(normalizationFactors(dds)),dds$Replicate),las=2,
main="Sequencing libraries size factor by Replicate",
outline=FALSE)boxplot(split(t(normalizationFactors(dds[,!dds$Replicate == 4])),dds[,!dds$Replicate == 4]$EcotypeLatitude),las=2,
main="Sequencing libraries size factor by Ecotype in replicate 1-3",
outline=FALSE)boxplot(split(t(normalizationFactors(dds[,!dds$Replicate == 4])),dds[,!dds$Replicate == 4]$LightCondition),las=2,
main="Sequencing libraries size factor by Light condition in replicate 1-3",
outline=FALSE)👉 The scaling factor distribution is dependent of replicate (only 4), but not dependent of other variables.
plot(colMeans(normalizationFactors(dds)),
log10(colSums(counts(dds))),ylab="log10 raw depth",
xlab="average scaling factor",
col=rainbow(n=4)[as.integer(dds$Replicate)],pch=19)
legend("bottomright",fill=rainbow(n=4),
legend=unique(dds$Replicate),cex=0.6)👉 The scaling factor appear linearly proportional to the sequencing depth in all replicates.
let’s look at standard deviations before and after VST normalization. This plot is to see whether there is a dependency of SD on the mean.
Before:
After:
After VST normalization, the red line is almost horizontal which indicates no dependency of variance on mean (homoscedastic).
👉 We can conclude that the variance stabilization worked adequately
Using PCAtools
We define the number of variable of the model:
An the number of possible combinations
We devise the optimal number of components using two methods
We plot the percentage explained by different components and try to empirically assess whether the observed number of components would be in agreement with our model’s assumptions.
ggplot(tibble(x=1:length(percent),y=cumsum(percent),p=percent),aes(x=x,y=y)) +
geom_line() + geom_col(aes(x,p)) + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component",breaks=1:length(percent),minor_breaks=NULL) +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",linewidth=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",linewidth=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",linewidth=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",linewidth=0.5) +
geom_vline(xintercept=c(horn$n,elbow),colour="black",linetype="dotted",linewidth=0.5) +
geom_label(aes(x = horn$n + 1, y = cumsum(percent)[horn$n],label = 'Horn', vjust = 1)) +
geom_label(aes(x = elbow + 1, y = cumsum(percent)[elbow],label = 'Elbow', vjust = 1))## Warning in geom_label(aes(x = horn$n + 1, y = cumsum(percent)[horn$n], label = "Horn", : All aesthetics have length 1, but the data has 18 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing a single row.
## Warning in geom_label(aes(x = elbow + 1, y = cumsum(percent)[elbow], label = "Elbow", : All aesthetics have length 1, but the data has 18 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing a single row.
## Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_line()`).
👉 The first component explains 40% of the data variance. Both metrics, Horn and Elbow suggest that one or two components are those that are informative. Indeed the slope of the curve is fairly linear past PC3 and that would indicate that the remaining PCs only capture sample specific noise. While this is only empirical, the scree plot support having only few variables of importance in the dataset.
p1 <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=LightCondition,shape=EcotypeLatitude,text=SampleID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p1) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep=""))) %>% suppressWarnings()The same as a biplot
biplot(p,
colby = 'LightCondition',
colLegendTitle = 'LightCondition',
encircle = TRUE,
encircleFill = TRUE,
legendPosition = 'top',
legendLabSize = 16, legendIconSize = 8.0)biplot(p,
colby = 'EcotypeLatitude',
colLegendTitle = 'EcotypeLatitude',
encircle = TRUE,
encircleFill = TRUE,
legendPosition = 'top',
legendLabSize = 16, legendIconSize = 8.0)👉 PC1 (37%) explains light condition. PC2 (11%) explains the outliers of each light condition group. The outliers are from totally different ecotypes, light conditions, and replicates.
p2 <- ggplot(pc.dat,aes(x=PC1,y=PC3,col=LightCondition,shape=EcotypeLatitude,text=SampleID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p2) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC3 (",percent[3],"%)",sep=""))) %>% suppressWarnings()The same as a biplot
biplot(p,x = 'PC1', y = 'PC3',
colby = 'LightCondition',
colLegendTitle = 'LightCondition',
encircle = TRUE,
encircleFill = TRUE,
legendPosition = 'top',
legendLabSize = 16, legendIconSize = 8.0)biplot(p,x = 'PC1', y = 'PC3',
colby = 'EcotypeLatitude',
colLegendTitle = 'EcotypeLatitude',
encircle = TRUE,
encircleFill = TRUE,
legendPosition = 'top',
legendLabSize = 16, legendIconSize = 8.0)👉 PC1 (37%) explains light condition. PC2 (11%) and PC3 (6%) explain the outliers of each light condition group. The outliers are from totally different ecotypes, light conditions, and replicates.
This allows for looking at more dimensions, five by default
Loadings, i.e. the individual effect of every gene in a component can be studied. Here the most important ones are visualized throughout the different PCs
plotloadings(p,
rangeRetain = 0.01,
labSize = 4.0,
title = 'Loadings plot',
subtitle = 'PC1 to PC5',
caption = 'Top 1% variables',
drawConnectors = TRUE)## -- variables retained:
## PS_chr01_G003393, PS_chr08_G035125, PS_chr04_G017440, PS_sUP0888_G056782, PS_chr02_G009805, PS_sUP2859_G058052, PS_chr10_G044546, PS_chr08_G033170, PS_chr10_G043389, PS_chr02_G005503, PS_chr02_G005502
## Warning: ggrepel: 23 unlabeled data points (too many overlaps). Consider increasing max.overlaps
This is a plot showing the correlation between the PC and the model variables. Note that while this might be relevant for a linear variable, it is less so for categorical variables. Sorting categorical variables in a linear order according to the PCs above might help.
Plotting only the relevant variables.
👉 PC1 and PC4 explain light conditions. PC5 explains ecotypes.
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- colnames(sampleDistMatrix) <- dds$SampleID
hm <- pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=pal)
hm👉 B clusters together well. A and D are also well separated except for a pair of A and D samples, which are the ones driving PC2, that look different.
The figures show the number of genes expressed per condition at different expression cutoffs. The scale on the lower plot is the same as on the upper. The first plot is a heatmap showing the number of genes above a given cutoff. The second plot shows it as a ratio of the number of genes expressed for (a) given variable(s) divided by the average number of genes expressed at that cutoff across all variable(s). The latter plot is of course biased at higher cutoff as the number of genes becomes smaller and smaller. The point of these two plots is to assert whether the number of genes expressed varies between conditions, as this would break some assumptions for normalisation and differential expression.
conds <- factor(paste(dds$LightCondition,dds$EcotypeLatitude))
dev.null <- rangeSamplesSummary(counts=vst,
conditions=conds,
nrep=3)Plotting the number of genes that are expressed (at any level)
(nrow(vst) - colSums(vst==0)) %>% as.data.frame() %>%
rownames_to_column("SampleID") %>%
rename("ExpressedGenes"=".") %>%
left_join(samples,by="SampleID") %>%
ggplot(aes(x=LightCondition, y=ExpressedGenes, fill=EcotypeLatitude)) +
geom_dotplot(binaxis = "y", stackdir = "center")## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
👉 For D, number of genes are quite varied, and seems to be affected by ecotype. The number of genes expressed in other light conditions are not affected by ecotype.
Here we want to visualise all the informative genes as a heatmap. We first filter the genes to remove those below the selected noise/signal cutoff. The method employed here is naive, and relies on observing a sharp decrease in the number of genes within the first few low level of expression. Using an independent filtering method, such as implemented in DESeq2 would be more accurate, but for the purpose of QA validation, a naive approach is sufficient. Note that a sweet spot for computation is around 20 thousand genes, as building the hierarchical clustering for the heatmap scales non-linearly.
👉 Here a cutoff of 1 is applied
⚠️ 54.6% (32170) of total 58969 genes are plotted below:
mat <- t(scale(t(vst[sels[[vst.cutoff+1]],])))
hm <- pheatmap(mat,
color = hpal,
border_color = NA,
clustering_distance_rows = "correlation",
clustering_method = "ward.D2",
show_rownames = FALSE,
#labels_col = conds,
angle_col = 90,
legend = FALSE)
hm👉 By selecting only informative genes, the expression seems to group well by light conditions where D is separated. A and B are close to each other but still remain as separate clades. We can still see the outliers of A and D here.
Below we assess the previous dendrogram’s reproducibility and plot the clusters with au and bp where:
⚠️Clusters with AU larger than 95% are highlighted by rectangles, which are strongly supported by data
hm.pvclust <- pvclust(data = t(scale(t(vst[sels[[vst.cutoff+1]],]))),
method.hclust = "ward.D2",
nboot = 100, parallel = TRUE)## Creating a temporary cluster...done:
## socket cluster with 15 nodes on host 'localhost'
## Multiscale bootstrap... Done.
👉 By selecting only informative genes, samples are grouped by light conditions where D is separated. A and B are close to each other but still remain as separate clades.
##
## Cluster method: ward.D2
## Distance : correlation
##
## Estimates on edges:
##
## si au bp se.si se.au se.bp v c pchi
## 1 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 2 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 3 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 4 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 5 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 6 0.626 0.782 0.884 0.120 0.091 0.011 -0.986 -0.206 0.497
## 7 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 8 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 9 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 10 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 11 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 12 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 13 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 14 0.815 0.921 0.848 0.077 0.042 0.012 -1.222 0.193 0.980
## 15 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 16 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 17 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.5 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Stockholm
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel grid stats4 stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] vsn_3.74.0 tximport_1.34.0 RColorBrewer_1.1-3 pvclust_2.2-0 plotly_4.10.4 pheatmap_1.0.12
## [7] PCAtools_2.18.0 ggrepel_0.9.6 patchwork_1.3.0 magrittr_2.0.3 limma_3.62.1 hyperSpec_0.100.2
## [13] xml2_1.3.6 lattice_0.22-6 here_1.0.1 gtools_3.9.5 gplots_3.2.0 emoji_16.0.0
## [19] DESeq2_1.46.0 SummarizedExperiment_1.36.0 Biobase_2.66.0 MatrixGenerics_1.18.0 matrixStats_1.4.1 GenomicRanges_1.58.0
## [25] GenomeInfoDb_1.42.0 IRanges_2.40.0 S4Vectors_0.44.0 BiocGenerics_0.52.0 data.table_1.16.2 lubridate_1.9.3
## [31] forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
## [37] tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.17.1 jsonlite_1.8.9 farver_2.1.2 rmarkdown_2.29 zlibbioc_1.52.0 vctrs_0.6.5
## [7] DelayedMatrixStats_1.28.0 htmltools_0.5.8.1 S4Arrays_1.6.0 SparseArray_1.6.0 proj4_1.0-14 sass_0.4.9
## [13] KernSmooth_2.23-24 bslib_0.8.0 htmlwidgets_1.6.4 plyr_1.8.9 testthat_3.2.1.1 cachem_1.1.0
## [19] ggalt_0.4.0 lifecycle_1.0.4 pkgconfig_2.0.3 rsvd_1.0.5 Matrix_1.7-1 R6_2.5.1
## [25] fastmap_1.2.0 GenomeInfoDbData_1.2.13 digest_0.6.37 colorspace_2.1-1 rprojroot_2.0.4 dqrng_0.4.1
## [31] irlba_2.3.5.1 crosstalk_1.2.1 beachmat_2.22.0 labeling_0.4.3 fansi_1.0.6 timechange_0.3.0
## [37] httr_1.4.7 abind_1.4-8 compiler_4.4.0 bit64_4.5.2 withr_3.0.2 BiocParallel_1.40.0
## [43] hexbin_1.28.4 Rttf2pt1_1.3.12 maps_3.4.2 MASS_7.3-61 DelayedArray_0.32.0 caTools_1.18.3
## [49] tools_4.4.0 extrafontdb_1.0 glue_1.8.0 reshape2_1.4.4 generics_0.1.3 gtable_0.3.6
## [55] tzdb_0.4.0 preprocessCore_1.68.0 hms_1.1.3 BiocSingular_1.22.0 ScaledMatrix_1.14.0 utf8_1.2.4
## [61] XVector_0.46.0 pillar_1.9.0 vroom_1.6.5 bit_4.5.0 deldir_2.0-4 tidyselect_1.2.1
## [67] locfit_1.5-9.10 knitr_1.49 xfun_0.49 statmod_1.5.0 brio_1.1.5 stringi_1.8.4
## [73] UCSC.utils_1.2.0 lazyeval_0.2.2 yaml_2.3.10 evaluate_1.0.1 codetools_0.2-20 interp_1.1-6
## [79] extrafont_0.19 BiocManager_1.30.25 cli_3.6.3 affyio_1.76.0 ash_1.0-15 munsell_0.5.1
## [85] jquerylib_0.1.4 Rcpp_1.0.13-1 png_0.1-8 latticeExtra_0.6-30 jpeg_0.1-10 sparseMatrixStats_1.18.0
## [91] bitops_1.0-9 viridisLite_0.4.2 scales_1.3.0 affy_1.84.0 crayon_1.5.3 rlang_1.1.4
## [97] cowplot_1.1.3
Created by Fai Theerarat Kochakarn